{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os,ogr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pyspark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "sc = pyspark.SparkContext()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "eqcsv = os.getcwd()+\"\\\\data\\\\eq2013.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "rdd = sc.textFile(eqcsv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8205"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rdd.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['date,time,x,y,z,lv,M,v1,v2,v3,v4',\n",
       " '2013/5/31,23:41:56,-113.408,37.175,6.6,2.5,ML2.5,SLC,,UTAH,',\n",
       " '2013/5/31,23:09:05,-113.411,37.178,6,2.5,ML2.5,SLC,,UTAH,',\n",
       " '2013/5/31,22:45:34,-113.413,37.172,4,2.9,ML2.9,SLC,,UTAH,',\n",
       " '2013/5/31,22:34:26,-113.414,37.174,3.2,2.8,ML2.8,SLC,,UTAH,',\n",
       " '2013/5/31,22:34:02,-178.08,51.127,26,3.1,ML3.1,AEIC,,ANDREANOF ISLANDS, ALEUTIAN IS.',\n",
       " '2013/5/31,22:21:56,143.032,21.648,322.5,4.4,,,,MARIANA ISLANDS REGION,',\n",
       " '2013/5/31,21:17:52,-64.754,19.103,36.9,2.9,MD2.9,RSPR,,VIRGIN ISLANDS,',\n",
       " '2013/5/31,20:17:11,161.56,-10.28,58.8,4.7,,,,SOLOMON ISLANDS,',\n",
       " '2013/5/31,19:51:33,-121.068,40.148,6.6,2.2,MD2.2,NC,,NORTHERN CALIFORNIA,']"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rdd.take(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 编写一个过滤的方法。\n",
    "### 原理是用xy两个字段来构造一个点要素，然后用gdal的IsValid方法来验证，如果能用，就表示是这一行是一个合法的要素描述信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def isPoint(line):\n",
    "    pntline = line.split(\",\")\n",
    "    try:\n",
    "        wkt = \"POINT({0} {1})\".format(float(pntline[2]),float(pntline[2]))\n",
    "        geom = ogr.CreateGeometryFromWkt(wkt)\n",
    "        return geom.IsValid()\n",
    "    except:\n",
    "        return False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "eqrdd = rdd.filter(lambda line : isPoint(line))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 执行过滤之后，发现减少了一行"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8204"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "eqrdd.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['2013/5/31,23:41:56,-113.408,37.175,6.6,2.5,ML2.5,SLC,,UTAH,',\n",
       " '2013/5/31,23:09:05,-113.411,37.178,6,2.5,ML2.5,SLC,,UTAH,',\n",
       " '2013/5/31,22:45:34,-113.413,37.172,4,2.9,ML2.9,SLC,,UTAH,',\n",
       " '2013/5/31,22:34:26,-113.414,37.174,3.2,2.8,ML2.8,SLC,,UTAH,',\n",
       " '2013/5/31,22:34:02,-178.08,51.127,26,3.1,ML3.1,AEIC,,ANDREANOF ISLANDS, ALEUTIAN IS.',\n",
       " '2013/5/31,22:21:56,143.032,21.648,322.5,4.4,,,,MARIANA ISLANDS REGION,',\n",
       " '2013/5/31,21:17:52,-64.754,19.103,36.9,2.9,MD2.9,RSPR,,VIRGIN ISLANDS,',\n",
       " '2013/5/31,20:17:11,161.56,-10.28,58.8,4.7,,,,SOLOMON ISLANDS,',\n",
       " '2013/5/31,19:51:33,-121.068,40.148,6.6,2.2,MD2.2,NC,,NORTHERN CALIFORNIA,',\n",
       " '2013/5/31,19:29:31,-67.355,18.185,12,2.7,MD2.7,RSPR,,MONA PASSAGE,']"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "eqrdd.take(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
